#################################################################################
# Replication file for:                                                         #
# "Balancing Precision and Retention in Experimental Design"                    #
#                                                                               #
# Gustavo Diaz                                                                  #
# Northwestern University                                                       #
# gustavo.diaz@northwestern.edu                                                 #
#                                                                               #
# Erin L. Rossiter                                                              #
# University of Notre Dame                                                      #
# erossite@nd.edu                                                               #
#                                                                               #
# This file conducts simulation study with data from six published experiments' #
# replication archives.                                                         #
#################################################################################

# Source "designer" function
source("code/designer.R")

# Data -----

# Publicly available data downloaded from Harvard Dataverse. See the readme
# file for details.

# study 1
s1 = read_dta("data/external_data/Galasso_et_al.dta")

# study 2
load("data/external_data/Manekin_Mitts.rdata")
s2 = us_survey_wave2
rm(us_survey_wave2)

# study 3
s3 = read.csv("data/external_data/Lyon.csv")

# study 4
s4 = read.csv("data/external_data/Goerger_et_al.csv")

# study 5
load("data/external_data/Curiel_et_al.rdata")
s5 = imputed
rm(imputed)

# study 6
s6 = read_dta("data/external_data/Simas.dta")


# Subset and clean -----
# Subset to experimental variables and covariates in pre-registration file

# Study 1
df1 = s1 %>% 
  filter(campaign != "Aggressive" & candidates_no == 2) %>% 
  mutate(Z = ifelse(campaign == "Negative", 1, 0)) %>% 
  select(Y = votebaldi, # outcome
         Z, # treatment
         Y0 = sub_riskaversion, # pre-treatment outcome proxy
         # covariates
         sub_trustdummy, sub_male, sub_age, sub_highschool, sub_south,
         sub_bigtown, sub_left
  ) %>%
  drop_na # drop incomplete cases, if any

# Remove single label
attr(df1$sub_age, "label") <- NULL 

# Study 2
df2 = s2 %>% 
  # Redo factors so that they don't have too many levels
  mutate(
    white = ifelse(race == "White / Caucasian", 1, 0)
  ) %>% 
  select(Y = police_action_required, # outcome 11 pt scale
         Z = black, # treatment
         # covariates
         pol_views, white, partyID, female # DEVIATION FROM PREREG BECAUSE TOO MANY CATEGORIES
  ) %>% 
  drop_na # drop incomplete cases, if any

# Study 3
# IMPORTANT DEVIATION
# We could not tell what category is represented by each number
# so we just grouped them in arbitrarily
df3 = s3 %>%
  filter(treatment != "western" &
           education < 999) %>% 
  mutate(Z = ifelse(treatment == "african", 1, 0),
         rel = ifelse(religion < 5, 1, 0), # There is a gap between 2 and 5
         edu = ifelse(education < 4, 1, 0), # Edu is plausibly incremental so we pick the median value
         mar = ifelse(marital == 2, 1, 0), # Maybe 2 is married and everything else is non-married
         gen = gender - 1) %>% 
  select(Y = ss_rights_index, # outcome
         Z, # treatment
         # covariates
         african_welcome_homosexuals, rel, gen, edu, mar
  ) %>% 
  drop_na # drop incomplete cases, if any

# Study 4
df4 = s4 %>%
  filter(treat <= 2) %>% 
  mutate(Z = ifelse(treat == 2, 1, 0)) %>% 
  select(Y = yes, # outcome
         Z, # treatment
         # covariates
         actual_all_crimes5yr, officers_assaulted5yr,
         tot_clr_murder5yr, tot_clr_index_violent5yr,
         tot_clr_index_property5yr
  ) %>% 
  # coarsening for speed
  mutate(actual_all_crimes5yr = as.character(cut(actual_all_crimes5yr, breaks = quantile(actual_all_crimes5yr), include.lowest = TRUE)),
         officers_assaulted5yr = ifelse(officers_assaulted5yr > 0, "1", "0"),
         tot_clr_murder5yr = ifelse(tot_clr_murder5yr > 0, "1", "0"),
         tot_clr_index_violent5yr = as.character(cut(tot_clr_index_violent5yr, breaks = quantile(tot_clr_index_violent5yr), include.lowest = TRUE)),
         tot_clr_index_property5yr = as.character(cut(tot_clr_index_property5yr, breaks = quantile(tot_clr_index_property5yr), include.lowest = TRUE))) %>%
  drop_na # drop incomplete cases, if any

# Study 5
df5 = s5 %>%
  select(Y = trust_score, # outcome
         Z = assignment, # treatment
         # covariates
         conflict_score, voted2016, age, schooling, gender, blanco, negro, indigena
  ) %>% 
  drop_na # drop incomplete cases, if any

# Study 6
df6 = s6 %>%
  select(Y = relative_traits, # outcome
         Z = threat_treat, # treatment
         # covariates
         relative_fave, high_sj, sameparty
  ) %>% 
  drop_na # drop incomplete cases, if any


# Designs -----
designs_study1 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df1),
                                N = nrow(df1),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = .1,
                                Y_binary = TRUE,
                                block_many_vars = list(c("sub_trustdummy", "sub_male", 
                                                    "sub_age", "sub_highschool", 
                                                    "sub_south", "sub_bigtown",
                                                    "sub_left")),
                                multiblock = "continuous")
designs_study2 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df2),
                                N = nrow(df2),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = 1,
                                Y_binary = FALSE,
                                block_many_vars = list(c("white", "partyID", "female")),
                                multiblock = "discrete")
designs_study3 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df3),
                                N = nrow(df3),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = .1,
                                Y_binary = FALSE,
                                block_many_vars = list( c("rel", "gen", "edu", "mar")),
                                multiblock = "discrete")
designs_study4 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df4),
                                N = nrow(df4),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = .1,
                                Y_binary = TRUE,
                                block_many_vars = list( c("actual_all_crimes5yr", "officers_assaulted5yr",
                                                          "tot_clr_murder5yr", "tot_clr_index_violent5yr",
                                                          "tot_clr_index_property5yr")),
                                multiblock = "discrete")

designs_study5 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df5),
                                N = nrow(df5),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = .5,
                                Y_binary = FALSE,
                                block_many_vars = list(c("voted2016", "age", "schooling",
                                        "gender", "blanco", 
                                        "negro", "indigena")),
                                multiblock = "continuous")
designs_study6 <- expand_design(expand = T,
                                designer = designer,
                                x = list(df6),
                                N = nrow(df6),
                                drop = seq(0, .5, by = 0.1),
                                Y = "Y",
                                corr = seq(.25, .75, by = .25),
                                u_m = 0,
                                u_sd = 1,
                                Y_binary = FALSE,
                                block_many_vars = list(c("high_sj","sameparty")),
                                multiblock = "discrete")

# set up our own simulation because DeclareDesign is actually slower
combos <- expand.grid(drop = seq(0, .5, by = 0.1), corr = seq(.25, .75, by = .25))
combos <- combos %>%
  slice(rep(1:n(), each = 6))

my_sim_function <- function(id, d){
  ests <- draw_estimates(d)
  ests <- ests %>%
    mutate(sim_ID = id,
           drop = combos$drop,
           corr = combos$corr) %>%
    select(-term, -statistic, -p.value, -conf.low,  -conf.high, -df, -outcome, -inquiry)
  return(ests)
}

sims <- 1000

set.seed(1234)
out_study2 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study2, .id = NULL, .parallel = F)
saveRDS(out_study2, file = "data/processed_data/diagnosis_study2.RDS")

out_study5 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study5, .id = NULL, .parallel = F)
saveRDS(out_study5, file = "data/processed_data/diagnosis_study5.RDS")

out_study6 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study6, .id = NULL, .parallel = F)
saveRDS(out_study6, file = "data/processed_data/diagnosis_study6.RDS")

out_study3 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study3, .id = NULL, .parallel = F)
saveRDS(out_study3, file = "data/processed_data/diagnosis_study3.RDS")

out_study1 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study1, .id = NULL, .parallel = F)
saveRDS(out_study1, file = "data/processed_data/diagnosis_study1.RDS")

out_study4 <- plyr::adply(.data = 1:sims,.margins = 1, .fun =  my_sim_function, d = designs_study4, .id = NULL, .parallel = F)
saveRDS(out_study4, file = "data/processed_data/diagnosis_study4.RDS")